# Chunk defaults: small figures, and suppress package messages/warnings
# in the rendered document.
knitr::opts_chunk$set(
fig.height = 2.5,
fig.width = 4.2,
message = FALSE,
warning = FALSE
)
# Modeling stack: brms for fitting, tidybayes/bayesplot for posterior work.
library(tidyverse)
library(magrittr)
library(brms)
library(tidybayes)
library(bayesplot)
# Load the local package — presumably provides `palette` and `theme_burgyl`
# used throughout; confirm against the package source.
devtools::load_all()
The binomial distribution is denoted \(y \sim Binomial(n, p)\). \(y\) is a count, \(p\) is the probability any particular “trial” is a success, and \(n\) is the number of trials. They come in two flavors — the single-trial (bernoulli) form and the aggregated (binomial) form — that both use the logit link function.
Let's prepare data from a chimpanzee social experiment. We use the recipes package to explicitly create less than full rank dummy variables.
library(recipes)
library(rethinking)
data(chimpanzees)
# as_tibble() replaces the deprecated as_data_frame().
d <- as_tibble(chimpanzees) %>%
select(pulled_left, prosoc_left, condition, actor)
rm(chimpanzees)
# Unload rethinking so its functions don't mask tidyverse/brms equivalents.
detach("package:rethinking")
# Label the four treatment cells (lever side / partner presence) by indexing
# into a lookup vector — treatment index is 1 + prosoc_left + 2*condition,
# which maps exactly onto the four labels below — then one-hot encode both
# treatment and actor with recipes.
treatment_labels <- c("R/N", "L/N", "R/P", "L/P")
d <- d %>%
  mutate(
    treatment = treatment_labels[1 + prosoc_left + 2 * condition],
    actor = as.factor(actor))
df <- recipe(pulled_left ~ 0 + treatment + actor, data = d) %>%
  step_dummy(treatment, actor, one_hot = TRUE) %>%
  prep() %>%
  bake(new_data = d)
# Quick summary of the one-hot encoded design matrix.
df %>%
skimr::skim()
Skim summary statistics
n obs: 504
n variables: 12
── Variable type:integer ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
variable missing complete n mean sd p0 p25 p50 p75 p100 hist
pulled_left 0 504 504 0.58 0.49 0 0 1 1 1 ▆▁▁▁▁▁▁▇
── Variable type:numeric ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
variable missing complete n mean sd p0 p25 p50 p75 p100 hist
actor_X1 0 504 504 0.14 0.35 0 0 0 0 1 ▇▁▁▁▁▁▁▁
actor_X2 0 504 504 0.14 0.35 0 0 0 0 1 ▇▁▁▁▁▁▁▁
actor_X3 0 504 504 0.14 0.35 0 0 0 0 1 ▇▁▁▁▁▁▁▁
actor_X4 0 504 504 0.14 0.35 0 0 0 0 1 ▇▁▁▁▁▁▁▁
actor_X5 0 504 504 0.14 0.35 0 0 0 0 1 ▇▁▁▁▁▁▁▁
actor_X6 0 504 504 0.14 0.35 0 0 0 0 1 ▇▁▁▁▁▁▁▁
actor_X7 0 504 504 0.14 0.35 0 0 0 0 1 ▇▁▁▁▁▁▁▁
treatment_L.N 0 504 504 0.25 0.43 0 0 0 0.25 1 ▇▁▁▁▁▁▁▂
treatment_L.P 0 504 504 0.25 0.43 0 0 0 0.25 1 ▇▁▁▁▁▁▁▂
treatment_R.N 0 504 504 0.25 0.43 0 0 0 0.25 1 ▇▁▁▁▁▁▁▂
treatment_R.P 0 504 504 0.25 0.43 0 0 0 0.25 1 ▇▁▁▁▁▁▁▂
Remember, our coefficients ultimately need to be interpreted in the space on the other side of the link function. What would a flat prior that we used in the gaussian world, like \(normal(0, 10)\), mean in the logistic world? Well, too much of that density would be piled up beyond the bounds of probability, making anything but values near 0 and 1 improbable. A flat prior in the logit space is not a flat prior in the outcome probability space.
# Prior predictive check: draw from normal(0, 10) on the logit scale and push
# the draws through the inverse logit to see the implied prior on the
# probability scale. Fixes the "callibrated" typo in the plot subtitle.
rnorm(1e4, 0, 10) %>%
inv_logit_scaled() %>%
tibble::enframe() %>%
ggplot(aes(value)) +
geom_density(fill = palette[7], colour = "transparent") +
theme_burgyl() +
ggtitle("Improper prior - normal(0, 10)",
"Priors need to be calibrated to the outcome probability space")
Let's try something more reasonable.
# Same prior predictive check with a tighter prior: normal(0, 1.5) on the
# logit scale implies a near-flat prior on the probability scale.
logit_draws <- rnorm(1e4, 0, 1.5)
tibble::enframe(inv_logit_scaled(logit_draws)) %>%
  ggplot(aes(value)) +
  geom_density(fill = palette[7], colour = "transparent") +
  theme_burgyl() +
  ggtitle("A more proper prior", "normal(0, 1.5)")
Now we fit the data with a parameter for each actor and each possible treatment.
# Bernoulli GLM with no intercept: one coefficient per treatment dummy and
# per actor dummy (`.` expands to all columns of df except pulled_left).
b10.1 <-
brm(data = df, family = bernoulli,
pulled_left ~ 0 + .,
prior = prior(normal(0, 1.5), class = b),
refresh = 0, cores = 4)
summary(b10.1)
Family: bernoulli
Links: mu = logit
Formula: pulled_left ~ 0 + (treatment_L.N + treatment_L.P + treatment_R.N + treatment_R.P + actor_X1 + actor_X2 + actor_X3 + actor_X4 + actor_X5 + actor_X6 + actor_X7)
Data: df (Number of observations: 504)
Samples: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
total post-warmup samples = 4000
Population-Level Effects:
Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
treatment_L.N 0.88 0.50 -0.12 1.85 479 1.00
treatment_L.P 0.74 0.50 -0.22 1.71 440 1.00
treatment_R.N 0.28 0.50 -0.73 1.22 447 1.00
treatment_R.P -0.13 0.50 -1.13 0.82 448 1.00
actor_X1 -0.78 0.50 -1.76 0.21 456 1.00
actor_X2 3.66 0.82 2.18 5.48 1146 1.00
actor_X3 -1.08 0.52 -2.06 -0.04 465 1.00
actor_X4 -1.08 0.52 -2.09 -0.06 535 1.00
actor_X5 -0.78 0.51 -1.77 0.26 492 1.00
actor_X6 0.15 0.51 -0.83 1.18 475 1.00
actor_X7 1.66 0.57 0.59 2.82 582 1.00
Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample
is a crude measure of effective sample size, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
Below we examine our coefficients. We need to convert each parameter to the outcome scale. We can do this with brms::inv_logit_scaled. Note how antisocial actor 2 is.
# Convert every coefficient to the probability scale and plot actor and
# treatment effects side by side. Per the summary above, actor 2 almost
# always pulls left (b_actor_X2 ~ 3.7 on the logit scale).
post <- posterior_samples(b10.1) %>%
gather(parameter, value) %>%
mutate(value = inv_logit_scaled(value))
p_actors <- post %>%
filter(str_detect(parameter, "b_actor.*")) %>%
ggplot(aes(y=parameter, x=value)) +
tidybayes::geom_halfeyeh(fill = palette[3]) +
theme_burgyl()
p_treat <- post %>%
filter(str_detect(parameter, "b_treat.*")) %>%
ggplot(aes(y=parameter, x = value)) +
geom_halfeyeh(fill = palette[5]) +
theme_burgyl()
gridExtra::grid.arrange(p_actors, p_treat, ncol = 2)
We can now compare the contrasts in treatments.
# Treatment contrasts on the logit scale: no-partner minus partner within
# each lever side. A difference straddling zero means the partner condition
# has no clear effect.
posterior_samples(b10.1) %>%
select(starts_with("b_treat")) %>%
transmute(
` left lever` = (b_treatment_L.N - b_treatment_L.P),
` right lever` = (b_treatment_R.N - b_treatment_R.P)) %>%
gather(parameter, value) %>%
ggplot(aes(y = parameter, fill = parameter, x = value)) +
geom_halfeyeh(.width = c(.5, .95)) +
geom_vline(xintercept = 0, colour = alpha(palette[7], .6), linetype = 2) +
theme_burgyl() +
scale_fill_manual(values = c(palette[3], palette[5])) +
scale_x_continuous(breaks = round(seq(from=-.6, to = 1.4, by = .2),1)) +
theme(legend.title = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
ylab("") + xlab("Log odds of pulling the left lever") +
ggtitle("Prosocial behavior is not statistically distinguishable")
When that same data is aggregated, we can switch to a binomial regression instead of a bernoulli. Note the use of the criteria | trials() syntax.
# Reload the raw data and aggregate to counts: each
# actor/condition/prosoc_left cell pools its bernoulli trials into a single
# binomial count (18 per cell — see trials(18) in the model below).
library(rethinking)
data(chimpanzees)
d <- chimpanzees
detach("package:rethinking")
d_aggregated <- d %>%
select(-recipient, -block, -trial, -chose_prosoc) %>%
group_by(actor, condition, prosoc_left) %>%
summarise(x = sum(pulled_left))  # result stays grouped by actor, condition
d_aggregated %>%
filter(actor %in% c(1, 2))
loo is misleading in the aggregate case, specifically because you are leaving out an entire aggregated group of observations at a time. Stick to the bernoulli form if you want to use loo.
# Binomial model on the aggregated counts; trials(18) fixes the number of
# bernoulli trials behind each row.
b10.5 <-
brm(data = d_aggregated, family = binomial,
x | trials(18) ~ 1 + prosoc_left + condition:prosoc_left ,
prior = c(prior(normal(0, 10), class = Intercept),
prior(normal(0, 10), class = b)),
iter = 2500, warmup = 500, cores = 2, chains = 2,
refresh = 0)
loo(b10.5)
Computed from 4000 by 28 log-likelihood matrix
------
Monte Carlo SE of elpd_loo is 0.1.
Pareto k diagnostic values:
Count Pct. Min. n_eff
(-Inf, 0.5] (good) 27 96.4% 489
(0.5, 0.7] (ok) 1 3.6% 687
(0.7, 1] (bad) 0 0.0% <NA>
(1, Inf) (very bad) 0 0.0% <NA>
All Pareto k estimates are ok (k < 0.7).
See help('pareto-k-diagnostic') for details.
Each row is a department-gender with the number of acceptances out of total applications. This is just an aggregated bernoulli that we will represent as independent binomial trials to collect the effect of gender.
# UC Berkeley admissions data: build complementary 0/1 indicators for the
# two genders (female is simply the complement of male).
library(rethinking)
data(UCBadmit)
d <- UCBadmit
detach(package:rethinking)
library(brms)
rm(UCBadmit)
d <- d %>%
  mutate(
    male = ifelse(applicant.gender == "male", 1, 0),
    female = 1 - male)
d %>%
  glimpse
Observations: 12
Variables: 7
$ dept <fct> A, A, B, B, C, C, D, D, E, E, F, F
$ applicant.gender <fct> male, female, male, female, male, female, male, female, male, female, male, female
$ admit <int> 512, 89, 353, 17, 120, 202, 138, 131, 53, 94, 22, 24
$ reject <int> 313, 19, 207, 8, 205, 391, 279, 244, 138, 299, 351, 317
$ applications <int> 825, 108, 560, 25, 325, 593, 417, 375, 191, 393, 373, 341
$ male <dbl> 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0
$ female <dbl> 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
Here is the model we are running. We use a relatively flat prior on the outcome scale. \[ admission \sim Binomial(N_i, p_i) \\ logit(p_i) = \alpha * male + \beta * female \\ \alpha, \beta \sim Normal(0, 1.5) \]
# Gender-only binomial model. NOTE: the model statement above specifies
# alpha, beta ~ Normal(0, 1.5), but the original code used normal(0, .15) —
# a prior far too tight to produce the reported posterior means (e.g.
# b_female ~ -0.75). The prior sd is corrected to 1.5 to match the text.
b10.6 <-
brm(data = d, family = binomial,
admit | trials(applications) ~ 0 + male + female,
prior = c(prior(normal(0, 1.5), class = b)),
iter = 2500, warmup = 500, cores = 4, chains = 4,
refresh = 0, sample_prior = "yes")
plot(b10.6)
# Prior predictive distribution for the coefficients on the probability
# scale (sample_prior = "yes" above makes these draws available).
prior_samples(b10.6) %>%
gather(parameter) %>%
ggplot(aes(y = parameter, x = inv_logit_scaled(value))) +
geom_halfeyeh(.width = c(.95, .5),
fill = palette[5],
point_interval = median_hdi) +
theme_burgyl() +
ggtitle("Weakly regularizing prior for coefficients",
"95% and 50% highest density interval about the median")
# Posterior distributions of the two gender coefficients (logit scale).
posterior_samples(b10.6) %>%
select(starts_with("b_")) %>%
gather(parameter) %>%
ggplot(aes(value, parameter, fill = parameter)) +
geom_halfeyeh() +
theme_burgyl("tl") + ylab("") +
scale_fill_manual(values = c(palette[3], palette[6])) +
theme(legend.title = element_blank(),
axis.ticks.y = element_blank(),
axis.text.y = element_blank())
Now lets compute the contrasts.
# Gender contrast on both the logit and the probability scale.
# FIX: the original chained theme_burgyl("tl") and then a second bare
# theme_burgyl(), so the later call clobbered the "tl" legend placement;
# the redundant call is removed.
posterior_samples(b10.6) %>%
transmute(
`logit scale` = b_male - b_female,
`outcome scale` = inv_logit_scaled(b_male) - inv_logit_scaled(b_female)) %>%
gather(parameter) %>%
ggplot(aes(y = parameter, x = value, fill = parameter)) +
geom_halfeyeh(point_interval = median_hdi,
.width = c(.95, .5)) +
ggtitle("Men have a 12% higher acceptance rate than women",
"95% and 50% highest density interval about the median") +
theme_burgyl("tl") + ylab("") +
scale_fill_manual(values = c(palette[3], palette[6])) +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank())
I personally find the results easier to interpret on the outcome scale.
# Median and HDI of the male-female admission probability difference.
spread_draws(b10.6, b_male, b_female) %>%
transmute(diff = inv_logit_scaled(b_male) - inv_logit_scaled(b_female)) %>%
median_hdi()
What did we just do here? Recall that this our formula for a logistic regression. \[ logit(p) = -.2062730 * is\_male + -.7469153 * is\_female \]
For each gender, we predict the following probabilities. Note that the 12 is a difference in percentage points — not that males are 12% more likely to get admitted!
# Per-gender admission probabilities implied by the coefficients.
spread_draws(b10.6, b_male, b_female) %>%
select(starts_with("b_")) %>%
gather(parameter, value) %>%
mutate(value = inv_logit_scaled(value)) %>%
ggplot(aes(y = parameter, x = value)) +
tidybayes::stat_intervalh(point_interval = median_hdi)
Finally, let's perform a posterior prediction check. It turns out our predictions are pretty far off.
The problem in this case is that males and females do not apply to the same departments, and departments vary in their rates of admission. This makes the answer misleading. You can see the steady decline in admission probability for both males and females from department A to department F. Females in these data tended not to apply to departments like A and B, which had high overall admission rates. Instead they applied in large numbers to departments like F, which admitted less than 10% of applicants. So while it is true overall that females had a lower probability of admission in these data, it is clearly not true within most departments.
# Mean residual (observed minus predicted admits) by department; large
# values flag the departments the gender-only model fits worst.
d %>%
mutate(prediction = predict(b10.6)[,1],
residuals = admit - prediction) %>%
group_by(dept) %>%
summarise(resid = mean(residuals)) %>%
arrange(desc(abs(resid)))
What we will now do is include variables for each department. That way the gender variables will only provide marginal information on top of the baseline acceptance rates of each department.
summary(b10.8)
Family: binomial
Links: mu = logit
Formula: admit | trials(applications) ~ 0 + male + female + dept_A + dept_B + dept_C + dept_D + dept_E + dept_F
Data: df (Number of observations: 12)
Samples: 4 chains, each with iter = 2500; warmup = 500; thin = 1;
total post-warmup samples = 8000
Population-Level Effects:
Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
male -0.35 0.06 -0.48 -0.22 3106 1.00
female -0.43 0.07 -0.57 -0.29 3698 1.00
dept_A 0.79 0.08 0.64 0.95 4189 1.00
dept_B 0.68 0.09 0.51 0.85 5141 1.00
dept_C -0.18 0.08 -0.34 -0.03 4596 1.00
dept_D -0.22 0.08 -0.38 -0.06 4559 1.00
dept_E -0.50 0.09 -0.68 -0.32 5368 1.00
dept_F -1.36 0.09 -1.55 -1.17 5103 1.00
Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample
is a crude measure of effective sample size, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
Lets compute the contrast again. The difference in acceptance rates is no longer significant. What happened?
# Gender contrast on the probability scale after conditioning on
# department. NOTE(review): inv_logit of a lone gender coefficient ignores
# the department terms, so this is the contrast at a department effect of
# zero — confirm that is the intended comparison.
posterior_samples(b10.8) %>%
transmute(male = inv_logit_scaled(b_male),
female = inv_logit_scaled(b_female),
diff = male - female) %>%
select(diff) %>% gather(parameter) %>%
ggplot(aes(y=parameter, x=value)) +
geom_eyeh(fill = palette[6],
.width = c(.5, .95)) +
geom_vline(xintercept = 0, linetype = 2) +
theme_burgyl()
Department is a confound. It opens up a backdoor pipe to acceptance rates.
# DAG: gender affects acceptance both directly and through department
# choice, so department opens a backdoor path.
library(dagitty)
dag_spec <- "dag {
gender -> acceptance
department -> acceptance
gender -> department
}"
dag <- dagitty(dag_spec)
plot(dagitty::graphLayout(dag))
When more than two types of unordered events are possible, and the probability of each type of event is constant across trials, then the maximum entropy distribution is the multinomial distribution. Modeling this type of distribution can be done by either using a generalization of the logit link or by transforming the multinomial likelihood into a series of Poisson likelihoods.
The multinomial logit link function takes a vector of scores, one for each event type, and computes the probability of a specific event, k.
\[ Pr(k|s_1, s_2, \dots, s_K) = \frac {exp(s_k)} {\sum^K_{i=1}exp(s_i)} \]
In a multinomial GLM, you need to build K-1 linear models for K events, each of which can (1) use any predictor parameters it would like and (2) get assigned any predictor values it wants.
Example: Predicting career choice as a function of income. Here \(\beta_{INCOME}\) appears in each linear model for each possible career choice.
N <- 100
# simulate family incomes for each individual
family_income <- runif(N)
# assign a unique coefficient for each type of event: c(1, 0, -1)
b <- 1:-1
career <- rep(NA, N) # empty vector of choices for each individual
for (i in seq_len(N)) {
  # linear score for each of the 3 careers; softmax() (from rethinking)
  # converts the scores into choice probabilities
  score <- 0.5 * (1:3) + b * family_income[i]
  p <- softmax(score[1], score[2], score[3])
  career[i] <- sample(1:3, size = 1, prob = p)
}
# tibble() replaces the deprecated data_frame().
tibble(
  career,
  family_income
)
Alternatively, you can specify brand new parameters for your linear models. Interpretation is very difficult for models like these.
# Quadratic approximation of a categorical (multinomial-logit) model.
# NOTE(review): map() here is rethinking's quadratic-approximation fitter,
# not purrr::map — this chunk assumes rethinking is attached and masking.
# Career 1 is the reference category with its score pinned at 0.
m10.17 <- map(
alist(
career ~ dcategorical(softmax(0,s2,s3)),
s2 <- a2 + b2*family_income,
s3 <- a3 + b3*family_income,
c(a2, a3, b2, b3) ~ dnorm(0,5)
), data=list(career=career,family_income=family_income))
m10.17
Quadratic approximate posterior distribution
Formula:
career ~ dcategorical(softmax(0, s2, s3))
s2 <- a2 + b2 * family_income
s3 <- a3 + b3 * family_income
c(a2, a3, b2, b3) ~ dnorm(0, 5)
Posterior means:
a2 a3 b2 b3
-0.03633395 0.55688952 -0.52983875 -1.63672335
Log-likelihood: -106.93
Oftentimes, the upper limit of your binomial, \(n\), is unknown. In these cases you’d want to use a poisson distribution. Recall that a binomial distribution's mean is \(Np\) and its variance is \(Np(1-p)\). When N is very large and p is very small, these two are approximately the same.
# With large n and small p, Binomial(n, p) has mean and variance both
# approximately np — the poisson limit.
draws <- rbinom(1e5, size = 1000, prob = 1 / 1000)
tibble(
  mean = round(mean(draws), 2),
  var = round(var(draws), 2))
This unified parameter is represented by the poisson distribution’s sole parameter \(\lambda\) which is the expected value of the outcome y. It is also the expected variance of the counts of y. The conventional link function for a Poisson model is the log link, which ensures our count is always positive. \[ y_i \sim Poisson(\lambda_i) \\ log(\lambda_i) = \alpha + \beta(x_i - \bar x) \] The log link also implies an exponential relationship between predictors and the expected value.
Here we examine how population size determines complexity of tools developed.
suppressPackageStartupMessages(library(rethinking))
data(Kline)
d <- as_tibble(Kline)  # as_tibble() replaces deprecated as_data_frame()
rm(Kline)
detach("package:rethinking")
df <- d %>%
mutate(
# as.numeric() drops the 1-column matrix that scale() returns; the matrix
# column was the reason skim reported log_pop as character above.
log_pop = as.numeric(scale(log(population), center = TRUE)),
contact_high = ifelse(contact == "high", 1, 0),
contact_low = ifelse(contact == "low", 1, 0))
df %>%
select(culture, population, log_pop,
contact_high, contact_low, total_tools) %>%
skimr::skim()
Skim summary statistics
n obs: 10
n variables: 6
── Variable type:character ──────────────────────────────────────────────────────────────────────────────
variable missing complete n min max empty n_unique
log_pop 0 10 10 15 19 0 10
── Variable type:factor ─────────────────────────────────────────────────────────────────────────────────
variable missing complete n n_unique top_counts ordered
culture 0 10 10 10 Chu: 1, Haw: 1, Lau: 1, Mal: 1 FALSE
── Variable type:integer ────────────────────────────────────────────────────────────────────────────────
variable missing complete n mean sd p0 p25 p50 p75 p100 hist
population 0 10 10 34109.1 84793.03 1100 3897.75 7700 12050 275000 ▇▁▁▁▁▁▁▁
total_tools 0 10 10 34.8 17.85 13 22.5 30.5 42.25 71 ▇▇▇▃▃▃▁▃
── Variable type:numeric ────────────────────────────────────────────────────────────────────────────────
variable missing complete n mean sd p0 p25 p50 p75 p100 hist
contact_high 0 10 10 0.5 0.53 0 0 0.5 1 1 ▇▁▁▁▁▁▁▇
contact_low 0 10 10 0.5 0.53 0 0 0.5 1 1 ▇▁▁▁▁▁▁▇
Here is the model we’ll fit: \[ total\_tools \sim Poisson(\lambda_i) \\ log(\lambda_i) = \alpha_{contact{[i]}} + \beta_{contact[i]}*log(population_i) \\ \alpha_j \sim normal(3, .5) \\ \beta_j \sim normal(0, .2) \]
We chose our priors to make sure that they are sensible in the outcome space. The purpose of our more spiked beta prior is to dissuade our model from believing explosive exponential relationships.
# Prior predictive check on the outcome (count) scale: exponentiate draws
# from the alpha and beta priors to see the implied tool counts.
prior_draws <- tibble(
  alpha = rnorm(1e5, 3, .5),
  beta = rnorm(1e5, 0, .2)
)
prior_draws %>%
  gather() %>%
  mutate(value = exp(value)) %>%
  filter(!is.na(value)) %>%
  ggplot(aes(y = key, x = value)) +
  geom_halfeyeh(fill = palette[5]) +
  coord_cartesian(xlim = c(0, 50)) +
  theme_burgyl() + ylab("Prior") + xlab("exp(prior)")
# Intercept-only Poisson baseline for total_tools.
b10.9 <-
brm(data = df, family = poisson,
total_tools ~ 1,
prior = c(prior(normal(3, .5), class = Intercept)),
chains = 4, cores = 4, sample_prior = "yes", refresh = 0)
Compiling the C++ model
Start sampling
b10.9
Family: poisson
Links: mu = log
Formula: total_tools ~ 1
Data: df (Number of observations: 10)
Samples: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
total post-warmup samples = 4000
Population-Level Effects:
Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
Intercept 3.54 0.05 3.43 3.65 1678 1.00
Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample
is a crude measure of effective sample size, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
# Replace the centered log_pop with the raw log population, then fit
# separate intercepts per contact level plus a contact-specific slope.
df <- df %>%
mutate(log_pop = log(population))
b10.10 <-
brm(data = df, family = poisson,
total_tools ~ 0 + contact_low + contact_high + log_pop + contact_high:log_pop,
prior = c(prior(normal(0, 1), class = b)),
chains = 4, cores = 4, refresh = 0, sample_prior = "yes")
Compiling the C++ model
Start sampling
b10.10
Family: poisson
Links: mu = log
Formula: total_tools ~ 0 + contact_low + contact_high + log_pop + contact_high:log_pop
Data: df (Number of observations: 10)
Samples: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
total post-warmup samples = 4000
Population-Level Effects:
Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
contact_low 0.83 0.34 0.17 1.47 1268 1.00
contact_high 0.22 0.81 -1.37 1.77 1494 1.00
log_pop 0.27 0.03 0.21 0.34 1298 1.00
contact_high:log_pop 0.10 0.09 -0.08 0.29 1386 1.00
Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample
is a crude measure of effective sample size, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
loo.10.10 <- loo(b10.10, save_psis = TRUE, cores = 2)
Found 1 observations with a pareto_k > 0.7 in model 'b10.10'. It is recommended to set 'reloo = TRUE' in order to calculate the ELPD without the assumption that these observations are negligible. This will refit the model 1 times to compute the ELPDs for the problematic observations directly.
print(loo.10.10)
Computed from 4000 by 10 log-likelihood matrix
------
Monte Carlo SE of elpd_loo is NA.
Pareto k diagnostic values:
Count Pct. Min. n_eff
(-Inf, 0.5] (good) 9 90.0% 486
(0.5, 0.7] (ok) 0 0.0% <NA>
(0.7, 1] (bad) 1 10.0% 155
(1, Inf) (very bad) 0 0.0% <NA>
See help('pareto-k-diagnostic') for details.
plot(loo.10.10, label_points = TRUE)
# Prediction grid over population for both contact levels.
td <- tibble(
population = seq(from = 0, to = 275000, length.out = 250)) %>%
expand(nesting(population), contact = c("low", "high")) %>%
mutate(
log_pop = log(population),
contact_high = ifelse(contact == "high", 1, 0),
# BUG FIX: the original used ifelse(contact == "low", 0, 1), inverting the
# indicator relative to the fitting data (cf. the earlier contact_low
# definition) and thereby swapping the two predicted curves.
contact_low = ifelse(contact == "low", 1, 0))
td <- td %>%
bind_cols(
as_tibble(
predict(b10.10, newdata = td))) %>%
filter(!is.nan(Estimate))  # population = 0 gives log_pop = -Inf -> NaN rows
NAs producedNAs produced
# Predicted total tools vs raw population, with observed points overlaid.
p1 <- td %>%
ggplot(aes(x = population, y = Estimate,
ymin = `Q2.5`, ymax = `Q97.5`,
group = contact)) +
geom_ribbon(colour = "transparent", alpha = .6,
aes(fill = contact)) +
geom_line() +
geom_point(data = d, inherit.aes = FALSE,
mapping = aes(x = population, y = total_tools,
shape = contact, colour = contact)) +
theme_burgyl("tl") + ylab("Predicted Total Tools") +
scale_fill_manual(values = c(palette[3], palette[5])) +
scale_color_manual(values = c(palette[6], palette[7])) #+
# ggtitle("Posterior prediction plot",
# "total_tools ~ contact_low + contact_high + log_pop + contact_high:log_pop")
# Same predictions on the log-population scale, then show both side by side.
p2 <- td %>%
ggplot(aes(x = log_pop, y = Estimate,
ymin = `Q2.5`, ymax = `Q97.5`,
group = contact)) +
geom_ribbon(colour = "transparent", alpha = .6,
aes(fill = contact)) +
geom_line() +
geom_point(data = d, inherit.aes = FALSE,
mapping = aes(x = log(population), y = total_tools,
shape = contact, colour = contact)) +
theme_burgyl() + ylab("") + xlab("log(population)") +
theme(legend.position="none") +
scale_fill_manual(values = c(palette[3], palette[5])) +
scale_color_manual(values = c(palette[6], palette[7]))
gridExtra::grid.arrange(p1, p2, ncol = 2,
top = "Posterior prediction plot")
The rate parameter, \(\lambda\), of a Poisson distribution can be normalized across different exposure frequencies. McElreath gives the example of combining data from a daily log and a weekly log.
\[ y_i \sim Poisson(\lambda_i) \\ log(\lambda_i) = log(\frac{\mu_i}{\tau_i}) = \alpha + \beta x_i \\ log(\mu_i) = log(\tau_i) + \alpha + \beta x_i \]
\(\tau\) is just a column of data, often referred to as the offset. Let's simulate an example to show how this works, where one monastery records its manuscript completions daily while another does so weekly.
num_days <- 30
num_weeks <- 4
# Simulate daily counts for monastery 1 and weekly counts for monastery 2.
# Use the length variables instead of repeating the magic numbers 30 and 4,
# and tibble() instead of the deprecated data_frame(). The lengths are bound
# to fresh names first because the `num_days` COLUMN would mask the outer
# variable inside later tibble() expressions.
n_daily <- num_days
n_weekly <- num_weeks
d <- tibble(
manuscripts = c(rpois(n_daily, 1.5),
rpois(n_weekly, .5 * 7)),
num_days = c(rep(1, n_daily), rep(7, n_weekly)),
monastery_id = c(rep(0, n_daily), rep(1, n_weekly)))
d <- d %>%
mutate(
log_num_days = log(num_days),
monastary_1 = as.numeric(monastery_id == 0),
monastary_2 = as.numeric(monastery_id == 1))
d %>%
tail(10)
Now lets fit the model.
# Poisson model with an exposure offset: log_num_days normalizes the weekly
# counts to a common daily rate.
b10.15 <-
brm(data = d, family = poisson,
manuscripts ~ 0 + offset(log_num_days) + monastary_1 + monastary_2,
prior = c(prior(normal(0, 1), class = b)),
sample_prior = "yes", refresh = 0,
iter = 2500, warmup = 500, cores = 2, chains = 2)
Compiling the C++ model
recompiling to avoid crashing R session
Start sampling
b10.15
Family: poisson
Links: mu = log
Formula: manuscripts ~ 0 + offset(log_num_days) + monastary_1 + monastary_2
Data: d (Number of observations: 34)
Samples: 2 chains, each with iter = 2500; warmup = 500; thin = 1;
total post-warmup samples = 4000
Population-Level Effects:
Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
monastary_1 0.32 0.15 0.01 0.61 2527 1.00
monastary_2 -0.56 0.24 -1.06 -0.12 2753 1.00
Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample
is a crude measure of effective sample size, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
# Daily manuscript rates on the outcome scale: rates[1] is monastery 1
# (exp(0.32) ~ 1.4/day) and rates[2] is monastery 2 (exp(-0.56) ~ 0.6/day),
# so rates[1] / rates[2] is how many times FASTER monastery 1 is.
# FIXES: the original subtitle stated the direction backwards, and called
# glue() bare even though the glue package is never attached (only
# glue::glue is used elsewhere). as_tibble replaces deprecated as_data_frame.
rates <- fixef(b10.15) %>%
as_tibble %>%
pull(Estimate) %>%
exp
subtitle <- glue::glue("Monastary 1 produces manuscripts {round(rates[1] / rates[2], 1)} times as fast as monastary 2")
# rate_diff = exp(b_monastary_1) / exp(b_monastary_2) > 1 means monastery 1
# is faster; the posterior sits well above 1, so the original title had the
# monasteries swapped — corrected below.
posterior_samples(b10.15) %>%
select(starts_with("b_")) %>%
transmute(rate_diff = exp(b_monastary_1) / exp(b_monastary_2)) %>%
gather(Parameter, Value) %>%
ggplot(aes(y = Parameter, x = Value)) +
geom_vline(xintercept = rates[1] / rates[2]) +
tidybayes::stat_intervalh(.width = c(.9, .95, .99)) +
geom_vline(xintercept = 0, linetype = 2) +
coord_cartesian(xlim = c(0, 6)) +
theme_bw() + theme(panel.grid = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
legend.position = c(1, 1),
legend.justification = c(1, 1),
legend.background = element_rect(fill = "transparent")) +
ylab("") + xlab("Times increase in manuscript creation rate") +
annotate("text", x = 3.1, y = 1.4,
label = glue::glue(
"Median HDPI: {round(rates[1] / rates[2], 1)}")) +
ggtitle("Monastary 1 creates manuscripts significantly faster than Monastary 2")
# Reload UCBadmit for the binomial-vs-poisson comparison.
suppressPackageStartupMessages(library(rethinking))
data(UCBadmit)
df <- as_tibble(UCBadmit)  # as_tibble() replaces deprecated as_data_frame()
detach("package:rethinking")
df
# Intercept-only binomial model of the overall admission probability.
b.binom <- brm(admit | trials(applications) ~ 1,
df, binomial,
prior = prior(normal(0, 100), class = "Intercept"),
refresh = 0)
b.binom
Family: binomial
Links: mu = logit
Formula: admit | trials(applications) ~ 1
Data: df (Number of observations: 12)
Samples: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
total post-warmup samples = 4000
Population-Level Effects:
Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
Intercept -0.46 0.03 -0.52 -0.40 1387 1.00
Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample
is a crude measure of effective sample size, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
summary(m_pois)
Inference for Stan model: 62dbe67ebbb91aa25dccdb88353bb466.
3 chains, each with iter=2000; warmup=1000; thin=1;
post-warmup draws per chain=1000, total post-warmup draws=3000.
mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
a1 4.99 0.00 0.02 4.94 4.97 4.99 5.00 5.03 2100 1
a2 5.44 0.00 0.02 5.40 5.43 5.44 5.46 5.48 2208 1
lp__ 19302.17 0.03 0.97 19299.64 19301.76 19302.45 19302.89 19303.14 1208 1
Samples were drawn using NUTS(diag_e) at Sat Feb 16 16:07:48 2019.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at
convergence, Rhat=1).
# The binomial admission probability should match the proportion implied by
# the two poisson rates: exp(a1) / (exp(a1) + exp(a2)).
# NOTE(review): assumes m_pois is the Stan fit from the unshown chunk.
k <- as.numeric(coef(m_pois))
tibble(
binom_fit = inv_logit_scaled(fixef(b.binom))[,1],
pois_fit = exp(k[1]) / (exp(k[1]) + exp(k[2])))
NA
Survival models measure displacement, which are continuous deviations from some point of reference. Examples of this are durations (time to discharge). When all we know is the average displacement, the exponential distribution is the maximum entropy distribution. Alternatively, the gamma distribution is the maximum entropy distribution for fixed mean value and fixed mean magnitude.
Survival models also deal with censoring, which is when some other event happens and gets in the way of measuring the event of interest (death instead of discharge).
Let's examine cats in Austin, where we are interested in the time until their adoption. We have one entry per cat in the data set.
suppressPackageStartupMessages(library(rethinking))
data(AustinCats)
# as_tibble() replaces the deprecated as_data_frame(); adopt is 1 for
# adopted cats and 0 for censored observations.
d <- as_tibble(AustinCats) %>%
mutate(adopt = ifelse(out_event == "Adoption", 1L, 0L))
rm(AustinCats)
detach("package:rethinking")
# Standard left assignment instead of the original parenthesized
# right-assign (`->`); the trailing `d` still prints the result.
d <- d %>%
transmute(id,
adopt,
days_to_event,
is_black = as.integer(color == "Black"))
d
We’ll run the following model to predict the number of days. For adopted cats, the days to adoption is exponentially distributed and easy to encode. For non-adopted cats, we employ the cumulative probability distribution which gives the proportion of cats adopted before or at a certain number of days. Therefore, \(1 - cumulative\_distribution\) gives the proportion of cats not yet adopted at day \(d\). This is called the complementary cumulative probability distribution and is displayed in the second line below.
\[ D_i | A_i = 1 \sim Exponential(\lambda_i) \\ D_i | A_i = 0 \sim ExponentialCCDF(\lambda_i) \\ \lambda_i = \frac{1}{\mu_i} \\ log \mu_i = \alpha_{CID[i]} \]
Let's run this in Stan.
# NOTE(review): m11.14 (the Stan exponential survival model) is fit in a
# chunk not shown here; extract.samples() comes from rethinking.
post <- as_data_frame(extract.samples(m11.14)$a) %>%
rename(black = V1,
other = V2)
# exp(a) converts each log-mean back to expected days to adoption.
post %>%
gather(colour) %>%
mutate(value = exp(value)) %>%
ggplot(aes(x = value, y = colour)) +
geom_halfeyeh(.width = c(.95, .5)) +
theme(panel.background = element_rect(fill = "transparent",
colour = "black")) +
xlab("Days to adoption") + ylab("Cat colour")
First lets gather and preprocess the data.
library(MASS)
data(eagles)
df <- as_tibble(eagles)  # as_tibble() replaces deprecated as_data_frame()
rm(eagles)
# Detach MASS promptly so MASS's select() doesn't mask dplyr's.
detach("package:MASS")
df <- df %>%
transmute(
successes = y,
attempts = n,
pirate_size = ifelse(P == 'L', "large", "small"),
adult = ifelse(A == "I", "immature", "adult"),
victim_size = ifelse(V == "L", "large", "small"))
library(recipes)
# One-hot encode the three binary factors; explicit () on prep/juice for
# clarity over the bare magrittr call forms.
df <- df %>%
recipe(successes ~ .) %>%
step_dummy(pirate_size, adult, victim_size, one_hot = TRUE) %>%
prep() %>%
juice()
df
We drop the intercept and include all factor levels of our binary variables.
# Intercept-free formula; `.` expands to every dummy column in df.
hypothesis <- bf(successes | trials(attempts) ~ 0 + .)
get_prior(hypothesis, family = binomial, data = df)
Our stan estimates look solid so lets just run with those.
# Fit the full-dummy binomial model with a wide normal(0, 4) slope prior.
fit.10h3 <- brm(family = binomial, data = df,
formula = hypothesis,
prior = c(prior(normal(0, 4), class = b)),
chains = 4, cores = 4, refresh = 0)
plot(fit.10h3)
Lets take a look at our estimates on the outcome scale.
# Coefficients mapped through the inverse logit to the probability scale.
posterior_samples(fit.10h3) %>%
select(starts_with("b_")) %>%
gather(parameter, value) %>%
mutate(value = inv_logit_scaled(value)) %>%
ggplot(aes(y=parameter, x = value)) +
geom_halfeyeh(.width = c(.5, .95))
# Posterior predictive check: predicted success rates (crossbars) against
# observed success rates (points) for each row.
# as_tibble() replaces the deprecated as_data_frame().
as_tibble(predict(fit.10h3, newdata = df)) %>%
bind_cols(index=1:nrow(df), df, .) %>%
ggplot(aes(x = index, colour = as.factor(pirate_size_large), shape = as.character(adult_adult))) +
geom_crossbar(aes(y = Estimate/attempts, ymin = `Q2.5`/attempts, ymax = `Q97.5`/attempts)) +
geom_point(aes(y = successes/attempts), size = 3) +
theme(panel.grid = element_blank())
NA
NA
NA
# Refit with an intercept and an adult x pirate-size interaction.
fit.10h3.2 <- update(fit.10h3, refresh = 0,
formula = bf(successes | trials(attempts) ~ 1 + adult_adult +
pirate_size_large +
victim_size_large +
adult_adult:pirate_size_large))
fit.10h3.2
Family: binomial
Links: mu = logit
Formula: successes | trials(attempts) ~ adult_adult + pirate_size_large + victim_size_large + adult_adult:pirate_size_large
Data: df (Number of observations: 8)
Samples: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
total post-warmup samples = 4000
Population-Level Effects:
Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
Intercept -0.60 1.01 -2.76 1.25 1940 1.00
adult_adult 3.08 1.17 0.90 5.44 1620 1.00
pirate_size_large 6.06 1.32 3.64 8.84 1561 1.00
victim_size_large -4.96 1.06 -7.30 -3.12 1790 1.00
adult_adult:pirate_size_large -2.59 1.29 -5.18 -0.08 1634 1.00
Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample
is a crude measure of effective sample size, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
# Trace/density plots for the refit, then compare the two models by WAIC.
plot(fit.10h3.2)
waic(fit.10h3, fit.10h3.2)
WAIC SE
fit.10h3 30.95 8.43
fit.10h3.2 21.33 5.91
fit.10h3 - fit.10h3.2 9.62 3.28
loo(fit.10h3, fit.10h3.2, reloo = TRUE)
LOOIC SE
fit.10h3 42.69 13.22
fit.10h3.2 28.23 8.83
fit.10h3 - fit.10h3.2 14.46 4.74
First, let's load the data and take a look at it.
library(rethinking)
data(salamanders)
df <- as_tibble(salamanders)  # as_tibble() replaces deprecated as_data_frame()
rm(salamanders)
detach("package:rethinking")
# Rename to snake_case: salamander counts, percent ground cover, and forest
# age per site.
df <- df %>%
transmute(
site = SITE,
num_salaman = SALAMAN,
pct_cover = PCTCOVER,
age_trees = FORESTAGE)
df %>% skimr::skim()
Skim summary statistics
n obs: 47
n variables: 4
── Variable type:integer ─────────────────────────────────────────────────────────────────────────────
variable missing complete n mean sd p0 p25 p50 p75 p100 hist
age_trees 0 47 47 168.81 193.51 0 29.5 64 266.5 675 ▇▁▂▁▁▁▁▁
num_salaman 0 47 47 2.47 3.37 0 0 1 3 13 ▇▂▁▁▁▁▁▁
pct_cover 0 47 47 58.98 35.78 0 18 83 88 93 ▃▁▁▁▁▁▁▇
site 0 47 47 24 13.71 1 12.5 24 35.5 47 ▇▇▇▇▇▇▇▇
Now for some preprocessing. We want to try a few different transformations of our predictors.
# Log-transform forest age (dropping sites with age 0, whose log is -Inf),
# then center/scale the candidate predictors.
# NOTE(review): scale() returns 1-column matrices as columns; glm/brm cope,
# but wrap in as.numeric() if downstream tooling chokes.
dm <- df %>%
mutate(log_age_trees = log(age_trees)) %>%
filter(is.finite(log_age_trees)) %>%
mutate(
pct_cover_cs = scale(pct_cover),
log_age_trees_cs = scale(log_age_trees),
age_trees_cs = scale(age_trees))
dm %>% head
Interestingly, the best performing model only uses the percent of coverage. It appears the age of trees adds too much noise and does not generalize well.
# Candidate Poisson models, nested by predictor set, then compared by LOO.
# FIX: the original intercept-only call passed refresh = 0 twice; the
# duplicate argument is removed.
fit.10h.4.1 <- brm(num_salaman ~ 1,
family = poisson(), data = dm,
prior = c(
prior(normal(0, 1), class = "Intercept")),
sample_prior = "yes", refresh = 0,
chains = 4, cores = 4)
fit.10h.4.2 <- update(fit.10h.4.1, newdata = dm, refresh = 0,
num_salaman ~ 1 + pct_cover_cs,
prior = c(
prior(normal(0, 1), class = "Intercept"),
prior(normal(0, .5), class = "b")))
fit.10h.4.3 <- update(fit.10h.4.2, newdata = dm, refresh = 0,
num_salaman ~ 1 + pct_cover_cs + age_trees_cs)
fit.10h.4.4 <- update(fit.10h.4.3, newdata = dm, refresh = 0,
num_salaman ~ 1 + pct_cover_cs + log_age_trees_cs)
loo(fit.10h.4.1, fit.10h.4.2, fit.10h.4.3, fit.10h.4.4,
reloo = TRUE)
LOOIC SE
fit.10h.4.1 275.75 34.64
fit.10h.4.2 212.76 26.47
fit.10h.4.3 217.02 27.17
fit.10h.4.4 216.74 27.28
fit.10h.4.1 - fit.10h.4.2 62.98 19.24
fit.10h.4.1 - fit.10h.4.3 58.73 18.83
fit.10h.4.1 - fit.10h.4.4 59.01 18.81
fit.10h.4.2 - fit.10h.4.3 -4.25 1.39
fit.10h.4.2 - fit.10h.4.4 -3.97 1.05
fit.10h.4.3 - fit.10h.4.4 0.28 1.08
We can interpret the winning model like so:
# Back-transform from the log link: exp(Intercept) is the expected count at
# average cover; exp(slope) is the multiplicative effect of +1 sd of cover.
fixef(fit.10h.4.2)[,1] %>%
exp
Intercept pct_cover_cs
1.709934 2.753260
# Posterior (and prior, via sample_prior) effect estimates, exponentiated
# onto the multiplicative count scale.
posterior_samples(fit.10h.4.2) %>%
select(starts_with("b_"), starts_with("prior")) %>%
gather(parameter, value) %>%
mutate(value = exp(value)) %>%
ggplot(aes(y=parameter, x = value)) +
geom_halfeyeh() +
theme_bw() +
ggtitle("Posterior distribution for effect estimates")
Just to gut-check my bayesian findings, let's show that the simpler model also generalizes better under a frequentist bootstrap.
# Candidate formulas for a bootstrap cross-check of the model ranking above.
library(rsample)
library(yardstick)
formula.int <- formula(num_salaman ~ 1)
formula.cover.cs <- formula(num_salaman ~ 1 + pct_cover_cs)
formula.cover.trees <- formula(num_salaman ~ 1 + pct_cover_cs + age_trees)
formula.cover.logtrees <- formula(num_salaman ~ 1 + pct_cover_cs + log_age_trees)
formula.covercs.logtreescs <- formula(num_salaman ~ 1 + pct_cover_cs + log_age_trees_cs)
# Fit a Poisson GLM on the bootstrap analysis set and score RMSE on the
# held-out assessment set.
#
# splits:  an rsample split object (analysis/assessment pair)
# formula: the model formula to fit
# returns: holdout RMSE on the count scale
fit_model <- function(splits, formula) {
  fit <- glm(formula,
             family = poisson(), data = analysis(splits))
  holdout <- assessment(splits)
  # BUG FIX: predict.glm defaults to the link (log) scale, so the original
  # compared log-rates against raw counts. type = "response" puts the
  # predictions on the count scale before computing RMSE.
  preds <- predict(fit, newdata = holdout, type = "response")
  rmse_vec(holdout$num_salaman, preds)
}
# 1000 bootstrap resamples; compute each model's holdout RMSE, summarise
# with a highest-density interval, and rank (lowest RMSE first).
bootstraps(dm, times = 1000) %>%
mutate(
intercept = map_dbl(splits, ~ fit_model(.x, formula.int)),
cover_cs = map_dbl(splits, ~ fit_model(.x, formula.cover.cs)),
cover_trees = map_dbl(splits, ~ fit_model(.x, formula.cover.trees)),
cover_logtrees = map_dbl(splits, ~ fit_model(.x, formula.cover.logtrees)),
covercs_logtreescs = map_dbl(splits, ~ fit_model(.x, formula.covercs.logtreescs))) %>%
as.data.frame %>%
select(-splits, -id) %>%
gather(model, rmse) %>%
group_by(model) %>%
mean_hdi() %>%
arrange(rmse)